
clear all
set more off, perm

use "INTERMEDIATE/products_clean.dta", clear

keep upc upc_ver_uc upc_descr brand_code_uc brand_descr name_firm 

********************************************************************************
* 1) CLEANING AND SPLITTING DESCRIPTION IN WORDS
*clean from some symbols
replace brand_descr = subinstr(brand_descr,",","",.)
replace brand_descr = subinstr(brand_descr,"+","",.)
replace brand_descr = subinstr(brand_descr,";","",.)
replace brand_descr = subinstr(brand_descr,".","",.)
replace brand_descr = subinstr(brand_descr,"&","",.)
replace brand_descr = subinstr(brand_descr,"'","",.)
replace brand_descr = subinstr(brand_descr,"-","",.)


*splitting into pieces the brand description
split brand_descr,  gen(brand_descr_word)


********************************************************************************
* 1) NEW VARIABLES FOR EACH WORD LENGTH

*different number of words included
gen space = " "
egen brand_descr_1w = concat(brand_descr_word1)
egen brand_descr_2w = concat(brand_descr_word1 space brand_descr_word2)
egen brand_descr_3w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3)
egen brand_descr_4w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3 space brand_descr_word4)
egen brand_descr_5w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3 space brand_descr_word4 space brand_descr_word5)
egen brand_descr_6w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3 space brand_descr_word4 space brand_descr_word5 space brand_descr_word6)
egen brand_descr_7w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3 space brand_descr_word4 space brand_descr_word5 space brand_descr_word6 space brand_descr_word7)
egen brand_descr_8w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3 space brand_descr_word4 space brand_descr_word5 space brand_descr_word6 space brand_descr_word7 space brand_descr_word8)
egen brand_descr_9w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3 space brand_descr_word4 space brand_descr_word5 space brand_descr_word6 space brand_descr_word7 space brand_descr_word8 space brand_descr_word9)
egen brand_descr_10w = concat(brand_descr_word1 space brand_descr_word2 space brand_descr_word3 space brand_descr_word4 space brand_descr_word5 space brand_descr_word6 space brand_descr_word7 space brand_descr_word8 space brand_descr_word9 space brand_descr_word10)

 
********************************************************************************
* AUXILIARY VARIABLES 
sort name_firm brand_descr
gen id =_n

egen brand_1 = group(brand_descr_1w)
egen brand_2 = group(brand_descr_2w)
egen brand_3 = group(brand_descr_3w) 
egen brand_4 = group(brand_descr_4w) 
egen brand_5 = group(brand_descr_5w) 
egen brand_6 = group(brand_descr_6w) 
egen brand_7 = group(brand_descr_7w) 
egen brand_8 = group(brand_descr_8w) 
egen brand_9 = group(brand_descr_9w) 
egen brand_10 = group(brand_descr_10w) 

bys brand_1: egen num_brand_1 = nvals(id) 
bys brand_2: egen num_brand_2 = nvals(id) 
bys brand_3: egen num_brand_3 = nvals(id) 
bys brand_4: egen num_brand_4 = nvals(id) 
bys brand_5: egen num_brand_5 = nvals(id) 
bys brand_6: egen num_brand_6 = nvals(id) 
bys brand_7: egen num_brand_7 = nvals(id) 
bys brand_8: egen num_brand_8 = nvals(id) 
bys brand_9: egen num_brand_9 = nvals(id) 
bys brand_10: egen num_brand_10 = nvals(id) 

********************************************************************************
*pick the longest possible name for the brand such that all the osbervations within the same pair (firm, first word description) are equal

*MAIN VARIABLE- based on 2 words
gen brand_descr_main = brand_descr_2w
replace brand_descr_main = brand_descr_3w if num_brand_2 == num_brand_3
replace brand_descr_main = brand_descr_4w if num_brand_2 == num_brand_4
replace brand_descr_main = brand_descr_5w if num_brand_2 == num_brand_5
replace brand_descr_main = brand_descr_6w if num_brand_2 == num_brand_6
replace brand_descr_main = brand_descr_7w if num_brand_2 == num_brand_7
replace brand_descr_main = brand_descr_8w if num_brand_2 == num_brand_8
replace brand_descr_main = brand_descr_9w if num_brand_2 == num_brand_9
replace brand_descr_main = brand_descr_10w if num_brand_2 == num_brand_10

replace brand_descr_main = brand_descr_3w if length(brand_descr_2w) <= 3


sort brand_descr_main name_firm
egen brand_attempt_main = group(brand_descr_main name_firm)
egen brand_attempt_main_N = nvals(brand_attempt_main)

********************************************************************************

*ALTERNATIVE- based on 1 word
gen brand_descr_altern = brand_descr_1w
replace brand_descr_altern = brand_descr_2w if num_brand_1 == num_brand_2
replace brand_descr_altern = brand_descr_3w if num_brand_1 == num_brand_3
replace brand_descr_altern = brand_descr_4w if num_brand_1 == num_brand_4
replace brand_descr_altern = brand_descr_5w if num_brand_1 == num_brand_5
replace brand_descr_altern = brand_descr_6w if num_brand_1 == num_brand_6
replace brand_descr_altern = brand_descr_7w if num_brand_1 == num_brand_7
replace brand_descr_altern = brand_descr_8w if num_brand_1 == num_brand_8
replace brand_descr_altern = brand_descr_9w if num_brand_1 == num_brand_9
replace brand_descr_altern = brand_descr_10w if num_brand_1 == num_brand_10

replace brand_descr_altern = brand_descr_2w if brand_descr_word1 =="YES" | brand_descr_word1=="9TH" | brand_descr_word1=="CTL"

replace brand_descr_altern = brand_descr_2w if length(brand_descr_1w) <= 3
replace brand_descr_altern = brand_descr_3w if length(brand_descr_2w) <= 3


sort brand_descr_altern name_firm
egen brand_attempt_altern = group(brand_descr_altern name_firm)
egen brand_attempt_altern_N = nvals(brand_attempt_altern)


********************************************************************************
*original variable count
egen brand_firm_count = group(brand_descr name_firm)
egen brand_firm_N = nvals(brand_firm_count)
********************************************************************************

keep upc upc_ver_uc brand_attempt_main brand_descr_main brand_descr_altern brand_attempt_altern name_firm
save "INTERMEDIATE/products_clean_brand.dta", replace
